// **********************************************************************
// 
// <copyright>
// 
//  BBN Technologies
//  10 Moulton Street
//  Cambridge, MA 02138
//  (617) 873-8000
// 
//  Copyright (C) BBNT Solutions LLC. All rights reserved.
// 
// </copyright>
// **********************************************************************
// 
// $Source: /cvs/distapps/openmap/src/openmap/com/bbn/openmap/util/CSVTokenizer.java,v $
// $RCSfile: CSVTokenizer.java,v $
// $Revision: 1.6 $
// $Date: 2008/03/03 16:44:13 $
// $Author: dietrick $
// 
// **********************************************************************

package com.bbn.openmap.util;

/**
 * Tokenizer for comma separated values files, at least as generated by excel.
 * <p>
 * token() returns the next token, which can be either:
 * <ul>
 * <li>the EMPTY object, indicating an empty field.
 * <li>a Double, indicating a numeric field (only when numberReadAsString is
 * false).
 * <li>a String, indicating an alphanumeric field.
 * <li>the NEWLINE object, indicating the end of a record.
 * <li>the EOF object, test with isEOF(), indicating the end of file.
 * </ul>
 * <p>
 * Character reading, one-character putback and the token buffer (next(),
 * putback(), bpush(), bclear()) are inherited; this class only decides how
 * the characters are grouped into CSV fields.
 */
public class CSVTokenizer extends Tokenizer {
    /** A flag that makes the tokenizer read numbers as strings. */
    boolean numberReadAsString = false;

    /**
     * The token most recently returned by token(). Starts out as NEWLINE so
     * that a comma at the very beginning of the input is recognized as
     * following an empty first field.
     */
    protected Object lastTokened = NEWLINE;

    /**
     * Create a tokenizer that converts numeric fields to Double objects.
     * 
     * @param in input Reader
     */
    public CSVTokenizer(java.io.Reader in) {
        this(in, false);
    }

    /**
     * If you set numberReadAsString to true, then any number will be
     * maintained as a String.
     * 
     * @param in input Reader
     * @param numberReadAsString true if numbers should be kept as Strings
     */
    public CSVTokenizer(java.io.Reader in, boolean numberReadAsString) {
        super(in);
        this.numberReadAsString = numberReadAsString;
    }

    /**
     * Read the next token from the stream and remember it in lastTokened.
     * 
     * @return the next object read from the stream: EMPTY, a Double, a
     *         String, NEWLINE or EOF.
     */
    public Object token() {
        int c = next();
        Object ret;
        if (c == ',') {
            if (lastTokened == NEWLINE) {
                // Catch the first empty field on a new line. The comma is
                // put back so the next call sees it as a plain separator.
                putback(c);
                ret = EMPTY;
            } else {
                ret = tokenAfterComma();
            }
        } else if (c == '\n') {
            ret = NEWLINE;
        } else {
            ret = tokenField(c);
        }

        lastTokened = ret;
        return ret;
    }

    /**
     * Return the next object read from the stream, called if a comma is found
     * first in order to catch empty fields accurately.
     * <p>
     * A comma followed directly by another comma, a newline or the end of the
     * input denotes an empty field. In all three cases the terminating
     * character is put back so the following token() call reports it, which
     * means a trailing empty field is returned whether or not the last record
     * ends with a newline.
     * 
     * @return EMPTY for an empty field, otherwise the field's value.
     */
    protected Object tokenAfterComma() {
        int c = next();
        if (c == ',' || c == '\n' || c == -1) {
            putback(c);
            return EMPTY;
        }
        return tokenField(c);
    }

    /**
     * Dispatch on the first character of a field to the matching reader.
     * Shared by token() and tokenAfterComma(), which handle separators and
     * record ends themselves.
     * 
     * @param c first character of the field, or -1 at the end of the input.
     * @return the field's value, or EOF if c is -1.
     */
    private Object tokenField(int c) {
        if (c == '"') {
            // The opening quote itself is not part of the value.
            return tokenString(next());
        } else if (c == '\\') {
            // Let tokenString() process the escape sequence.
            return tokenString(c);
        } else if ((c == '-' || c == '.' || isDigit(c)) && !numberReadAsString) {
            return tokenNumber(c);
        } else if (c == -1) {
            return EOF;
        } else {
            return tokenAny(c);
        }
    }

    /**
     * Read the remainder of a quoted or escaped field.
     * <p>
     * A backslash forces the character that follows it into the result, which
     * is how quotes can be kept inside a value. Unescaped quotes are dropped
     * wherever they appear; the field ends when a quote is directly followed
     * by a delimiter, or when a character is reached that isAny() rejects.
     * 
     * @param c the first character to examine.
     * @return the contents of the token buffer, as returned by bclear().
     */
    Object tokenString(int c) {
        while (true) {
            if (c == '\\') {
                // Enable escapes to force characters into string.
                int escaped = next();
                if (escaped == -1) {
                    // Dangling escape at the end of the input: there is
                    // nothing to escape, so finish the field instead of
                    // pushing the end-of-input marker into the buffer.
                    putback(escaped);
                    return bclear();
                }
                bpush(escaped);
                c = next();
            } else if (c == '"') {
                // Quotes are ignored in any order until a delimiter is
                // reached. Quotes preceded by the escape character live on
                // in the string, via the branch above.
                c = next();
                if (isDelimiter(c)) {
                    return bclear();
                }
                // Otherwise loop again with the character after the quote.
            } else if (isAny(c)) {
                bpush(c);
                c = next();
            } else {
                return bclear();
            }
        }
    }

    /**
     * This checks for the delimiter at the end of a token. A delimiter is a
     * ',' separating the next field, a '\n' ending the record, or -1 marking
     * the end of the input.
     * <p>
     * All delimiters are handled equally: the character is put back so that
     * the next token() call sees it. Commas and the end of input used to be
     * consumed here, but putting them back lets token() detect empty fields,
     * especially the elusive comma-newline combination.
     * 
     * @param c the character to test.
     * @return true if c is a delimiter (and has been put back), false
     *         otherwise.
     */
    boolean isDelimiter(int c) {
        if (c == ',' || c == -1 || c == '\n') {
            putback(c); // Wait for next token().
            return true;
        }
        return false;
    }

    /**
     * Return a number or a string. The field is read with tokenAny() and then
     * parsed as a double; text that does not parse is returned unchanged.
     * 
     * @param c first character of the field.
     * @return a Double if the field parses as a number, otherwise the field
     *         text.
     */
    Object tokenNumber(int c) {
        Object result = tokenAny(c);
        try {
            return Double.valueOf((String) result);
        } catch (NumberFormatException e) {
            // Not a number after all (a lone "-", a date, ...), keep the text.
            return result;
        }
    }

    /**
     * Return anything up to the next delimiter as a string. The delimiter
     * itself is left for the next token() call by isDelimiter().
     * 
     * @param c first character of the field.
     * @return the contents of the token buffer, as returned by bclear().
     */
    Object tokenAny(int c) {
        while (!isDelimiter(c)) {
            bpush(c);
            c = next();
        }
        return bclear();
    }

    /**
     * Simple test driver: prints every token of the given CSV file on its own
     * line.
     * 
     * @param args args[0] is the path of the CSV file to read.
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java com.bbn.openmap.util.CSVTokenizer <csv-file>");
            return;
        }

        try {
            CSVTokenizer csv = new CSVTokenizer(new java.io.BufferedReader(new java.io.FileReader(args[0])));
            try {
                while (true) {
                    Object token = csv.token();
                    if (csv.isEOF(token)) {
                        return;
                    }
                    System.out.println(token);
                }
            } finally {
                // Release the reader on every path, not just at EOF.
                csv.close();
            }
        } catch (Exception e) {
            System.out.println(e);
        }
    }
}